{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Forest for Spam\n",
    "### (make sure to have run featurize_spam.py)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "np.random.seed(1)\n",
    "\n",
    "import scipy.io\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "from decisiontree import *\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import sys\n",
    "sys.setrecursionlimit(4000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3336, 31) (3336,) (5857, 31)\n"
     ]
    }
   ],
   "source": [
    "spam_path = \"datasets/spam-dataset/spam_data.mat\"\n",
    "data = scipy.io.loadmat(spam_path)\n",
    "x = data[\"training_data\"]\n",
    "x = StandardScaler().fit_transform(x)\n",
    "pca = PCA(n_components=0.50, whiten=True)\n",
    "pca_fit = pca.fit(x)\n",
    "x = pca_fit.transform(x)\n",
    "\n",
    "y = np.squeeze(data[\"training_labels\"])\n",
    "x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)\n",
    "x_test = pca_fit.transform(data[\"test_data\"])\n",
    "print(x.shape,y.shape,x_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-03-27 02:42:27,990\tWARNING worker.py:1406 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.\n",
      "2019-03-27 02:42:27,993\tINFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-03-27_02-42-27_58638/logs.\n",
      "2019-03-27 02:42:28,104\tINFO services.py:363 -- Waiting for redis server at 127.0.0.1:57498 to respond...\n",
      "2019-03-27 02:42:28,217\tINFO services.py:363 -- Waiting for redis server at 127.0.0.1:47475 to respond...\n",
      "2019-03-27 02:42:28,221\tINFO services.py:760 -- Starting Redis shard with 3.44 GB max memory.\n",
      "2019-03-27 02:42:28,233\tINFO services.py:1384 -- Starting the Plasma object store with 5.15 GB memory using /tmp.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9.2 s, sys: 2.89 s, total: 12.1 s\n",
      "Wall time: 6min 47s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "rf = RandomForest(num_trees=25,header=\"spam_randomforest\")\n",
    "rf.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training accuracy: 0.5472263868065967\n",
      "validation accuracy: 0.5344311377245509\n",
      "saved predictions\n",
      "CPU times: user 2.78 s, sys: 183 ms, total: 2.97 s\n",
      "Wall time: 3.01 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "train_preds = rf.predict(x_train,\"train\")\n",
    "print(\"training accuracy:\",np.mean(train_preds==y_train))\n",
    "val_preds = rf.predict(x_val,\"val\")\n",
    "print(\"validation accuracy:\",np.mean(val_preds==y_val))\n",
    "rf.predict(x_test,\"test\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
